Chapter 3 Data quality

3.1 Load the data that were prepared in the previous chapter.

load("data/data.Rdata")

3.2 Data loading

3.2.1 Define lists with the names of the files required for the statistics.

The files were produced by the bioinformatics pipeline, and located in 3D’omics ERDA.

3.2.1.1 General sequencing statistics of sequencing.

Used the multiqc_fastqc.txt because the multiqc_general_stats.txt did not exist in all batches.

multiqc_fastqc_list <- c(
  "https://sid.erda.dk/share_redirect/G2guEHWh9v/reports/by_step/reads_data/multiqc_fastqc.txt", # MSEB0006
  "https://sid.erda.dk/share_redirect/HiPNk7p4MG/reports/by_step/reads_data/multiqc_fastqc.txt", # MSEB0009
  "https://sid.erda.dk/share_redirect/cdU6P6sNuj/reports/by_step/reads_data/multiqc_fastqc.txt", # MSEB0010
  "https://sid.erda.dk/share_redirect/EUKYidpvOO/reports/by_step/reads_data/multiqc_fastqc.txt", # MSEB0011
  "https://sid.erda.dk/share_redirect/dEy2D1OmZi/reports/by_step/reads_data/multiqc_fastqc.txt", # MSEB0012
  "https://sid.erda.dk/share_redirect/B0E8AbA7Eu/reports/by_step/reads_data/multiqc_fastqc.txt", # MSEB0014
  "https://sid.erda.dk/share_redirect/hT3CftfSyw/reports/by_step/reads_data/multiqc_fastqc.txt" # MSEB0015
)

3.2.1.2 General sequencing statistics of sequencing after trimming.

multiqc_fastqc_trimmed_list <- list(
  list(file = "https://sid.erda.dk/share_redirect/G2guEHWh9v/reports/by_step/preprocess_data/multiqc_fastqc.txt", column_name = "Total Sequences"), # MSEB0006
  list(file = "https://sid.erda.dk/share_redirect/HiPNk7p4MG/reports/by_step/preprocess_data/multiqc_fastqc.txt", column_name = "Total Sequences"), # MSEB0009
  list(file = "https://sid.erda.dk/share_redirect/cdU6P6sNuj/reports/by_step/preprocess_data/multiqc_fastqc.txt", column_name = "Total Sequences"), # MSEB0010
  list(file = "https://sid.erda.dk/share_redirect/EUKYidpvOO/reports/by_step/preprocess_data/multiqc_fastqc.txt", column_name = "Total Sequences"), # MSEB0011
  list(file = "https://sid.erda.dk/share_redirect/dEy2D1OmZi/reports/by_step/preprocess_data/multiqc_fastqc.txt", column_name = "Total Sequences"), # MSEB0012
  list(file = "https://sid.erda.dk/share_redirect/hT3CftfSyw/reports/by_step/preprocess_data/multiqc_fastqc.txt", column_name = "Total Sequences"), # MSEB0015
  list(file = "https://sid.erda.dk/share_redirect/B0E8AbA7Eu/reports/by_step/preprocess_data/samtools-flagstat-dp_Read_counts.txt", column_name = "Total Reads") # MSEB0014
)

3.2.1.3 Percentage (%) of host and and of human mapped reads.

NB! This % is calculated on the trimmed reads. NB! The reads are mapped to 3 databases (human, chicken, pig) sequentially, so the % is after removing reads mapped to the previous db.

host_human_mapping_files <- list(
  list(file = "https://sid.erda.dk/share_redirect/G2guEHWh9v/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt", column_name = "mapped_passed_pct"), # MSEB0006
  list(file = "https://sid.erda.dk/share_redirect/HiPNk7p4MG/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt", column_name = "mapped_passed_pct"), # MSEB0009
  list(file = "https://sid.erda.dk/share_redirect/cdU6P6sNuj/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt", column_name = "mapped_passed_pct"), # MSEB0010
  list(file = "https://sid.erda.dk/share_redirect/EUKYidpvOO/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt", column_name = "mapped_passed_pct"), # MSEB0011
  list(file = "https://sid.erda.dk/share_redirect/dEy2D1OmZi/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt", column_name = "mapped_passed_pct"), # MSEB0012
  list(file = "https://sid.erda.dk/share_redirect/hT3CftfSyw/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt", column_name = "mapped_passed_pct"), # MSEB0015
  list(file = "https://sid.erda.dk/share_redirect/B0E8AbA7Eu/reports/by_step/preprocess_data/samtools-flagstat-dp_Percentage_of_total.txt", column_name = "Properly Paired") # MSEB0014
)

3.2.1.4 Number and percentage (%) of bacteria mapped reads.

NB! This % is calculated on the trimmed reads after filtering for human, chicken, and pig reads. ‘Non mapped reads’ at this point are trimmed but not mapped to human, chicken, pig, or bacterial MAG catalogue.

bacteria_mapping_files <- c(
  "https://sid.erda.dk/share_redirect/G2guEHWh9v/reports/by_step/quantify_data/multiqc_samtools_stats.txt", # MSEB0006
  "https://sid.erda.dk/share_redirect/HiPNk7p4MG/reports/by_step/quantify_data/multiqc_samtools_stats.txt", # MSEB0009
  "https://sid.erda.dk/share_redirect/cdU6P6sNuj/reports/by_step/quantify_data/multiqc_samtools_stats.txt", # MSEB0010
  "https://sid.erda.dk/share_redirect/EUKYidpvOO/reports/by_step/quantify_data/multiqc_samtools_stats.txt", # MSEB0011
  "https://sid.erda.dk/share_redirect/dEy2D1OmZi/reports/by_step/quantify_data/multiqc_samtools_stats.txt", # MSEB0012
  "https://sid.erda.dk/share_redirect/B0E8AbA7Eu/reports/by_step/quantify_data/multiqc_samtools_stats.txt", # MSEB0014
  "https://sid.erda.dk/share_redirect/hT3CftfSyw/reports/by_step/quantify_data/multiqc_samtools_stats.txt" # MSEB0015
)

3.2.2 Define functions to load the statistics.

At this step we choose the relevant rows and columns from the bioinformatics output.

3.2.2.1 General sequencing statistics of sequencing.

multiqc_fastqc_read_and_select <- function(file) {
  read_tsv(file,
    col_types = cols_only(
      "Sample" = col_character(),
      "Total Sequences" = col_double(),
      "%GC" = col_double(),
      "total_deduplicated_percentage" = col_double()
    ),
    show_col_types = FALSE
  ) %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(
      microsample = Sample,
      total_sequences = `Total Sequences`,
      percent_gc = `%GC`,
      percent_unique = total_deduplicated_percentage
    ) %>%
    select(microsample, total_sequences, percent_gc, percent_unique)
}

3.2.2.2 General sequencing statistics of sequencing after trimming.

trimmed_multiqc_fastqc_read_and_select <- function(file_info) {
  file <- file_info$file  # Extract the file path
  column_name <- file_info$column_name  # Extract the column name
  read_tsv(file,
    show_col_types = FALSE
  ) %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(
      microsample = Sample,
      total_trimmed_sequences = !!sym(column_name)
    ) %>%
    select(microsample, total_trimmed_sequences)
}

3.2.2.3 Percentage (%) of host and and of human mapped reads.

host_human_mapping_process_file <- function(file, column_name) {
  read_tsv(file, show_col_types = FALSE) %>%
    mutate(reference = case_when(
      grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
      grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
      TRUE ~ NA_character_
    )) %>%
    filter(!is.na(reference)) %>% # Remove rows where reference is NA
    mutate(
      microsample = str_extract(Sample, "M\\d+"),
      reads_mapped_host_percent = ifelse(reference == "chicken", !!sym(column_name), NA_real_),
      reads_mapped_human_percent = ifelse(reference == "human", !!sym(column_name), NA_real_)
    ) %>%
    select(microsample, reads_mapped_host_percent, reads_mapped_human_percent)
}

3.2.2.4 Number and percentage (%) of bacteria mapped reads.

bacteria_mapping_process_file <- function(file) {
  read_tsv(file, show_col_types = FALSE) %>%
    filter(str_detect(Sample, "mgg-pbdrep")) %>% #select samples mapped to mgg-pdrep database (i.e. NO 'salmonella' or 'chicken big mag')
    mutate(
      microsample = str_extract(Sample, "M\\d+"),
      reads_mapped_bacteria = reads_mapped,
      reads_mapped_bacteria_percent = reads_mapped_percent
    ) %>%
    select(microsample, reads_mapped_bacteria, reads_mapped_bacteria_percent)
}

3.2.3 Load the statistics.

Load each file with the functions above (to select the relevant columns/rows). Group by ‘microsample’ & estimate sums/means/etc. because in some files each microsample is split into two rows. Estimate the quality score of each sample, and add as a new column. This is optional and might be removed. Add an ‘others’, i.e. not mapped reads as a new column. Select which columns to include in the final object.

final_combined_stats <- bind_rows(
  lapply(multiqc_fastqc_list, multiqc_fastqc_read_and_select),# Process FastQC stats files
  lapply(multiqc_fastqc_trimmed_list, trimmed_multiqc_fastqc_read_and_select),# Process FastQC stats files after trimming
  lapply(host_human_mapping_files, function(x) { # Process host and human mapping files
    host_human_mapping_process_file(x$file, x$column_name)
  }),
  lapply(bacteria_mapping_files, bacteria_mapping_process_file) # Process bacterial mapping files
) %>%
  group_by(microsample) %>% # because there are two rows per sample in the multi_fastqc files. 
  summarise(
    total_sequences = sum(total_sequences, na.rm = TRUE), # sum the no. of sequences in the two rows of each sample
    total_trimmed_sequences = sum(total_trimmed_sequences, na.rm = TRUE),
    percent_gc = mean(percent_gc, na.rm = TRUE), # mean of GC% for the two rows. Only works because the no.of sequences is the same in the two rows.
    percent_unique = mean(percent_unique, na.rm = TRUE), # mean of unique% for the two rows. Only works because the no.of sequences is the same in the two rows.
    reads_mapped_host_percent = mean(reads_mapped_host_percent, na.rm = TRUE), # only one value here, so not actual mean
    reads_mapped_human_percent = mean(reads_mapped_human_percent, na.rm = TRUE), # only one value here, so not actual mean
    reads_mapped_bacteria = sum(reads_mapped_bacteria, na.rm = TRUE), # only one value here, so not actual sum
    reads_mapped_bacteria_percent = mean(reads_mapped_bacteria_percent, na.rm = TRUE) # only one value here, so not actual sum
  ) %>%
  # estimate quality score of each sample
  mutate(
    depth = ifelse(total_sequences > 10000000, 1, 0),
    duplicates = ifelse(percent_unique > 35, 1, 0),
    gc = ifelse(percent_gc < 60, 1, 0),
    human = ifelse(reads_mapped_human_percent < 5, 1, 0),
    bacteria = ifelse(reads_mapped_bacteria_percent > 75, 1, 0),
    quality = depth + duplicates + gc + human + bacteria,
    reads_mapped_other_percent = 100 - (reads_mapped_bacteria_percent) # calculate 'other' reads percentage - the bacterial % is from the trimmed & filtered reads already
  ) %>%
  select(microsample, total_sequences, total_trimmed_sequences, percent_gc, percent_unique, 
         reads_mapped_host_percent, reads_mapped_human_percent, 
         reads_mapped_bacteria, reads_mapped_bacteria_percent, 
         reads_mapped_other_percent, quality)

3.2.4 Write the final stats dataframe to a TSV file

final_combined_stats %>% write_tsv("results/final_combined_stats.tsv")
#print(final_combined_stats)

3.3 Data plotting

3.3.1 Define lists that contain the settings for plotting each statistic.

3.3.1.1 Sequencing depth

This is the total number of sequenced reads.

stat_params_total_sequences <- list(
  x_var = "total_sequences",
  x_label = "Number of reads",
  x_vline = 10000000,
  stacked = FALSE
)

3.3.1.2 Number of sequences after trimming

This is the total number of sequenced reads after trimming the adaptors and low quality sequences.

stat_params_trimmed_sequences <- list(
  x_var = "total_trimmed_sequences",
  x_label = "Number of trimmed reads",
  x_vline = NULL,
  stacked = FALSE
)

3.3.1.3 Number of trimmed sequences

This is the difference between total reads and reads after trimming.

prepare_stacked_data <- function(data) {
  data %>%
    mutate(trimmed_reads = total_sequences - total_trimmed_sequences) %>%
    pivot_longer(cols = c(total_trimmed_sequences, trimmed_reads),
                 names_to = "read_type", values_to = "reads") %>%
    mutate(read_type = factor(read_type, levels = c("trimmed_reads", "total_trimmed_sequences")))
}

stat_params_compare_sequences <- list(
  x_var = "total_trimmed_sequences",
  x_label = "Number of trimmed reads",
  x_vline = NULL,
  stacked = TRUE
)

3.3.1.4 Percentage (%) of unique sequences

stat_params_unique <- list(
  x_var = "percent_unique",
  x_label = "% of unique sequences",
  x_vline = 35,
  stacked = FALSE
)

3.3.1.5 Percentage (%) of GC content

stat_params_gc <- list(
  x_var = "percent_gc",
  x_label = "% of GC content",
  x_vline = 60,
  stacked = FALSE
)

3.3.1.6 Percentage (%) of host reads

stat_params_host_reads <- list(
  x_var = "reads_mapped_host_percent",
  x_label = "% of host reads",
  x_vline = NULL,
  stacked = FALSE
)

3.3.1.7 Percentage (%) of human reads

stat_params_human_reads <- list(
  x_var = "reads_mapped_human_percent",
  x_label = "% of human reads",
  x_vline = 5,
  stacked = FALSE
)

3.3.1.8 Percentage (%) of bacterial reads

NB! In the next iteration, it is better to do this by using the counts dataset instead of the statistics file.

stat_params_bacteria_reads <- list(
  x_var = "reads_mapped_bacteria_percent",
  x_label = "% of bacteria reads",
  x_vline = 75,
  stacked = FALSE
)

3.3.1.9 % of unmapped reads

stat_params_other_reads <- list(
  x_var = "reads_mapped_other_percent",
  x_label = "% of other reads",
  x_vline = 25,
  stacked = FALSE
)

3.3.1.10 Quality score

stat_quality_score <- list(
  x_var = "quality",
  x_label = "Quality score",
  x_vline = 5,
  stacked = FALSE
)

3.3.2 Define a list of all the statistics settings that you want to plot.

stat_params_list <- list(
  stat_total_sequences = stat_params_total_sequences,
  stat_trimmed_sequences = stat_params_trimmed_sequences,
  stat_compare_sequences = stat_params_compare_sequences,
  stat_unique = stat_params_unique,
  stat_gc = stat_params_gc,
  stat_host = stat_params_host_reads,
  stat_human = stat_params_human_reads,
  stat_bacteria = stat_params_bacteria_reads,
  stat_other = stat_params_other_reads,
  stat_quality = stat_quality_score
)

3.3.3 Define lists that contain the plot settings for different experiments/trials

3.3.3.1 B11 vs B12 lysis buffers

Compare buffer B11 and B12. Use batches MSEB0006 (caecum) and MSEB0010 (colon), from the focal (adult) chicken. For the colon, use the samples that took 15 PCR cycles instead of 19 (due to the latter’s low quality).

plot_params_buffers <- list(
  filter_conditions = list(
    quote(section != "Ileum"),
    quote(cycles < 16),
    quote(batch == "MSEB0006" | batch == "MSEB0010")
  ),
  labels_title = "Lysis Buffer",
  facet_formula = "section + type + buffer ~ .", #"batch + section + type ~ ."
  scale_fill_manual_val = c('#ffdf9e','#ffc273'), # '#a3d1cf','#d1a3cf'
  fill_var = "buffer",
  plot_title = "Lysis Buffer trial"
)

3.3.3.2 15 vs 19 PCR cycles

Use the colon samples (MSEB0010). Maybe separate by buffer??

plot_params_cycles <- list(
  filter_conditions = list(
    quote(batch == "MSEB0010")
  ),
  labels_title = "PCR cycles",
  facet_formula = "section + type + cycles ~ .", # "batch + section + type ~ ."
  scale_fill_manual_val = c('#ffc273','#e56969'),
  fill_var = "factor(cycles)",
  plot_title = "PCR cycles trial"
)

3.3.3.3 Limit of detection trial: Different LMD sizes

Use batch MSEB0014 (caecum).

plot_params_LOD <- list(
  filter_conditions = list(
    quote(batch == "MSEB0014")
  ),
  labels_title = "LMD size",
  facet_formula = "type + size ~ .", #"batch + section + type + cryosection ~ ."
  scale_fill_manual_val = c('#ffdf9e','#ffc273','#e56969','#c1558b','#8a49a1','#4f5bd5'),
  fill_var = "factor(size, levels = c(500, 1500, 2500, 5000, 25000, 50000))",
  plot_title = "Limit of detection (LMD size)"
)

3.3.3.4 Automation trial

Compare the quality of library prep with DreamPrep (MSEB0015) vs manual (MSEB0011) for ceacum of focal chicken

plot_params_automation <- list(
  filter_conditions = list(
    quote(batch == "MSEB0011"|batch == "MSEB0015"), 
    quote(animal == 'G121e')
  ),
  labels_title = "Automation",
  facet_formula = "batch + type + cryosection ~ .", #"batch + section + type + cryosection ~ ."
  scale_fill_manual_val = c('#e56969','#c1558b'),
  fill_var = "batch",
  plot_title = "Automation test"
)

3.3.3.5 Full vs. half reaction (library prep with UltraLowV2 Tecan kit)

Compare the quality of library prep with full reaction (MSEB0006, MSEB0009, MSEB0010) vs half reaction (MSEB0011, MSEB0012) of focal chicken, ceacum and colon (only low PCR cycles). NB! both buffers.

plot_params_protocol <- list(
  filter_conditions = list(
    quote(section != "Ileum"), 
    quote(batch != "MSEB0014"& batch != "MSEB0015"),
    quote(animal == 'G121e'),
    quote(cycles<16)
  ),
  labels_title = "Protocol",
  facet_formula = "type + section + protocol ~ .", #"type + section + batch ~ ."
  scale_fill_manual_val = c('#c1558b','#8a49a1'),
  fill_var = "protocol",
  plot_title = "Full vs. half reactions"
)

3.3.3.6 Ceacum vs colon

Compare the quality of colon vs caecum samples of the focal chicken (and only low PCR cycles)

plot_params_section <- list(
  filter_conditions = list(
    quote(section != "Ileum"), 
    quote(batch == "MSEB0009"|batch == "MSEB0010"|batch == "MSEB0011"|batch == "MSEB0012"),
    quote(animal == 'G121e'),
    quote(cycles<16)
  ),
  labels_title = "Section",
  facet_formula = "type + section ~ .", #"type+ batch ~ ."
  scale_fill_manual_val = c('#8a49a1','#4f5bd5'),
  fill_var = "section",
  plot_title = "Caecum vs colon"
)

3.3.3.7 Adult vs young chicken

Compare the quality of samples from the focal (adult) chicken vs the younger chicken, for both colon (MSEB0012) and caecum (MSEB0011).

plot_params_animal <- list(
  filter_conditions = list(
    quote(batch == "MSEB0011"|batch == "MSEB0012") 
  ),
  labels_title = "Animal",
  facet_formula = "type + section + animal ~ .", #"type+ batch + section + animal ~ ."
  scale_fill_manual_val = c('#ffc273','#c1558b'),
  fill_var = "animal",
  plot_title = "Adult vs young chicken"
)

3.3.3.8 LMD collection attemps

Compare the quality of samples coloured by the number of attempts to collect the LMD sample. LOD trial excluded.

plot_params_collection_attempts <- list(
  filter_conditions = list(
    quote(section != "Ileum"), 
    quote(batch != "MSEB0014"),
    quote(animal == 'G121e'),
    quote(cycles<16),
    quote(collection_attempts>0)
  ),
  labels_title = "Collection attempts",
  facet_formula = "type + section + collection_attempts ~ .", #"type + section + batch ~ ."
  scale_fill_manual_val = c('#ffdf9e','#ffc273','#e56969','#c1558b','#8a49a1','#4f5bd5'),
  fill_var = "factor(collection_attempts)",
  plot_title = "LMD collection attempts"
)

3.3.3.9 LMD collection success

Compare the quality of samples coloured by the LMD success jugded upon visual inspection of the collection lids. LOD trial excluded.

plot_params_collection_success <- list(
  filter_conditions = list(
    quote(section != "Ileum"), 
    quote(batch != "MSEB0014"),
    quote(animal == 'G121e'),
    quote(cycles<16),
    quote(collection_attempts>0)
  ),
  labels_title = "Collection_success",
  facet_formula = "type + section + collection ~ .", #"type + section + batch ~ ."
  scale_fill_manual_val = c('#ffc273','#e56969','#c1558b','#8a49a1','#4f5bd5'),
  fill_var = "collection",
  plot_title = "LMD collection success"
)

3.3.4 Define a list of all the experiments/trials settings that you want to plot.

plot_params_list <- list(
  plot_buffers = plot_params_buffers, 
  plot_cycles = plot_params_cycles,
  plot_LOD = plot_params_LOD,
  plot_automation = plot_params_automation,
  plot_protocol = plot_params_protocol,
  plot_section = plot_params_section,
  plot_animal = plot_params_animal,
  plot_collection_attempts = plot_params_collection_attempts,
  plot_collection_success = plot_params_collection_success
)

3.3.5 Define barplot function

First, define the plotting settings.

# Define a custom theme for your taxonomy plots
custom_ggplot_theme <- theme(
  strip.text.y.left = element_text(angle = 0),
  strip.text.y.right = element_text(angle = 0),
  axis.text = element_text(size = 6),
  axis.title = element_text(size = 12, face = "bold"),
  strip.background = element_rect(fill = "#dde3e9", color = "white", size = 0.8),  # Custom facet strip background
  strip.text = element_text(size = 8, face = "bold", color = "black"),  # Custom facet text
  strip.placement = "outside",  # Place strip outside the panel grid
  panel.spacing = unit(0.1, "lines"),  # Adjust space between panels
  panel.grid.major = element_line(color = "#dde3e9"),  # Customize major grid lines
  panel.grid.minor = element_blank(),  # Remove minor grid lines 
  panel.background = element_rect(fill = "white"),  # Change panel background color
  plot.margin = unit(c(1, 1, 1, 1), "cm")  # Adjust plot margins to ensure content fits
)

This function can be used for plotting different statistics (see stat_params list) and different experiments (see plot_params list).

plot_data <- function(data, metadata, plot_params, stat_params, bar_width = 0.9) {
  # Merge the data with metadata
  plot_data <- data %>%
    left_join(metadata, by = join_by(microsample == microsample)) 
  
  # Apply filters if provided
  if (length(plot_params$filter_conditions) > 0) {
    plot_data <- plot_data %>% filter(!!!plot_params$filter_conditions)
  }
  
  # Preprocess data if stacked plot is needed
  if (stat_params$stacked) {
    plot_data <- prepare_stacked_data(plot_data)
  }
  
    # Conditionally apply factor() for size based on the presence of 'size' in plot_params$fill_var
  if (grepl("size", plot_params$fill_var)) {
    plot_data <- plot_data %>%
      mutate(size = factor(size, levels = c(500, 1500, 2500, 5000, 25000, 50000)))
  }

  # Calculate plot height dynamically based on number of microsamples
  plot_height <- 5 + (nrow(plot_data) * 0.2) #10 + (nrow(plot_data) * 0.01)

  # Create the ggplot object
  p <- ggplot(plot_data) +
    { if (stat_params$stacked) {
      # Plot stacked bars if stacked is TRUE with fixed bar width
      geom_col(aes(x = reads, y = microsample, fill = read_type), position = "stack", width = bar_width)
    } else {
      # Plot normal bars if stacked is FALSE with fixed bar width
      geom_col(aes_string(x = stat_params$x_var, y = "microsample", fill = plot_params$fill_var, width = bar_width))
    }} +
    scale_fill_manual(values = if (stat_params$stacked) {
      # Use different shades for stacked bars
      c("total_trimmed_sequences" = plot_params$scale_fill_manual_val[1],
        "trimmed_reads" = scales::muted(plot_params$scale_fill_manual_val[1]))
    } else {
      # Use specified colors for non-stacked plots
      plot_params$scale_fill_manual_val
    }) +
    facet_nested(as.formula(plot_params$facet_formula), scales = "free", space = "free", switch = "y") +
    custom_ggplot_theme +
    labs(x = stat_params$x_label, 
         y = "Microsamples", 
         fill = plot_params$labels_title,
         title = plot_params$plot_title
    ) +
    coord_cartesian(clip = "off")
  
  # Add geom_vline if x_vline is provided
  if (!is.null(stat_params$x_vline)) {
    p <- p + geom_vline(xintercept = stat_params$x_vline, linetype = "dashed", color = "#1f2455", size = 0.3)
  }

  # Return plot and calculated height
  return(list(plot = p, height = plot_height))
}

3.4 Plot figures for all the statistics and all experiments.

NB! This function also saves the plots in a file. Comment out if you don’t want that.

# Initialize a list to store plots #fig.height=10, fig.fullwidth=TRUE, fig.height=(5 + (nrow(plot_data) * 0.2)/ 2.54
plots_list <- list()

# Loop through each combination of plot_params and stat_params
for (plot_param_name in names(plot_params_list)) {
  plot_params <- plot_params_list[[plot_param_name]]
  
  for (stat_param_name in names(stat_params_list)) {
    stat_params <- stat_params_list[[stat_param_name]]
    
    # Generate the plot with dynamic height
    result <- plot_data(final_combined_stats, sample_metadata, plot_params, stat_params)
    plot <- result$plot
    plot_height <- result$height
    
    # Create a dynamic plot name
    plot_name <- paste0(plot_param_name, "_", stat_param_name)
    
    # Store the plot in the list
    plots_list[[plot_name]] <- plot
    
    # Print the plot
    print(plot)
    
    # Save the plot to a file with dynamic height
    ggsave(filename = paste0("results/figures/statistics/", plot_name, ".jpg"), 
           plot = plot, 
           device = "jpg", 
           width = 30, 
           height = plot_height, 
           units = "cm", 
           dpi = 300, 
           limitsize = FALSE)
  }
}